library(tidyverse)  # data manipulation
## ── Attaching packages ─────────────────────────────────────── tidyverse 1.3.1 ──
## ✓ ggplot2 3.3.3     ✓ purrr   0.3.4
## ✓ tibble  3.1.1     ✓ dplyr   1.0.6
## ✓ tidyr   1.1.3     ✓ stringr 1.4.0
## ✓ readr   1.4.0     ✓ forcats 0.5.1
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## x dplyr::filter() masks stats::filter()
## x dplyr::lag()    masks stats::lag()
library(cluster)    # clustering algorithms
library(factoextra) # clustering algorithms & visualization
## Welcome! Want to learn more? See two factoextra-related books at https://goo.gl/ve3WBa
library(gProfileR)

Treshold FDR 5% log FC > 1 or < -1 (1204 genes)

load("~/Documents/MiGASti/Databases/df_num_scale_LOGFC1stim.Rdata")
load("~/Documents/MiGASti/Databases/Kmeans1204gsupport.Rdata")
df <- df_num_scale
#scale data:As we don?t want the clustering algorithm to depend to an arbitrary variable unit, we start by scaling/standardizing 
df3 <- scale(df)

Elbow method

set.seed(123)
fviz_nbclust(df3, kmeans, method = "wss")

Average silhouette approach

fviz_nbclust(df3, kmeans, method = "silhouette")

Gap statistic clustering

# compute gap statistic
set.seed(123)
gap_stat <- clusGap(df3, FUN = kmeans, nstart = 25,
                    K.max = 10, B = 50)
# Print the result
print(gap_stat, method = "firstmax")
## Clustering Gap statistic ["clusGap"] from call:
## clusGap(x = df3, FUNcluster = kmeans, K.max = 10, B = 50, nstart = 25)
## B=50 simulated reference sets, k = 1..10; spaceH0="scaledPCA"
##  --> Number of clusters (method 'firstmax'): 2
##           logW   E.logW       gap      SE.sim
##  [1,] 6.943107 7.882240 0.9391329 0.006428239
##  [2,] 6.809108 7.755145 0.9460373 0.006539340
##  [3,] 6.735218 7.677794 0.9425759 0.006013863
##  [4,] 6.667009 7.619304 0.9522946 0.006075357
##  [5,] 6.593164 7.582524 0.9893593 0.005630834
##  [6,] 6.542798 7.550159 1.0073609 0.005045045
##  [7,] 6.492002 7.522901 1.0308986 0.005105812
##  [8,] 6.453713 7.497541 1.0438275 0.004866291
##  [9,] 6.420197 7.476286 1.0560882 0.004774224
## [10,] 6.402617 7.456369 1.0537526 0.004633696
fviz_gap_stat(gap_stat)

summary 3 clusters

k3 <- kmeans(df3, centers = 3, nstart = 25)
str(k3)
## List of 9
##  $ cluster     : int [1:1204] 3 3 3 2 2 3 3 3 3 3 ...
##  $ centers     : num [1:3, 1:7] 2.1299 0.0598 -0.3688 -0.1881 0.5394 ...
##   ..- attr(*, "dimnames")=List of 2
##   .. ..$ : chr [1:3] "1" "2" "3"
##   .. ..$ : chr [1:7] "LPS" "IFNy" "IL4" "R848" ...
##  $ totss       : num 8421
##  $ withinss    : num [1:3] 716 1899 3005
##  $ tot.withinss: num 5619
##  $ betweenss   : num 2802
##  $ size        : int [1:3] 118 348 738
##  $ iter        : int 3
##  $ ifault      : int 0
##  - attr(*, "class")= chr "kmeans"
fviz_cluster(k3, data = df3)

summary 5 clusters

k5 <- kmeans(df3, centers = 5, nstart = 25)
str(k5)
## List of 9
##  $ cluster     : int [1:1204] 5 2 5 3 1 2 2 2 2 5 ...
##  $ centers     : num [1:5, 1:7] -0.0477 -0.7981 0.1702 2.1663 -0.0189 ...
##   ..- attr(*, "dimnames")=List of 2
##   .. ..$ : chr [1:5] "1" "2" "3" "4" ...
##   .. ..$ : chr [1:7] "LPS" "IFNy" "IL4" "R848" ...
##  $ totss       : num 8421
##  $ withinss    : num [1:5] 573 999 428 674 1529
##  $ tot.withinss: num 4204
##  $ betweenss   : num 4217
##  $ size        : int [1:5] 226 303 86 114 475
##  $ iter        : int 5
##  $ ifault      : int 0
##  - attr(*, "class")= chr "kmeans"
fviz_cluster(k5, data = df3)

Compare 2 clusters

summary 2 clusters

k2 <- kmeans(df3, centers = 2, nstart = 25)
str(k2)
## List of 9
##  $ cluster     : int [1:1204] 1 1 2 1 2 1 1 1 1 1 ...
##  $ centers     : num [1:2, 1:7] -0.3417 0.7437 -0.0193 0.0421 0.2855 ...
##   ..- attr(*, "dimnames")=List of 2
##   .. ..$ : chr [1:2] "1" "2"
##   .. ..$ : chr [1:7] "LPS" "IFNy" "IL4" "R848" ...
##  $ totss       : num 8421
##  $ withinss    : num [1:2] 4005 2531
##  $ tot.withinss: num 6536
##  $ betweenss   : num 1885
##  $ size        : int [1:2] 825 379
##  $ iter        : int 1
##  $ ifault      : int 0
##  - attr(*, "class")= chr "kmeans"
fviz_cluster(k2, data = df3)

spearman correlation coefficient between 2 clusters

#first run
k <- kmeans(df3, centers = 2, nstart = 25)
kmeans <- k$cluster
kmean_download <- as.data.frame(kmeans)
kmeanscor <- cbind(kmean_download, DF_log2FC)
gencode_30 <- read.delim("~/Documents/MiGASti/Databases/ens.geneid.gencode.v30")
colnames(gencode_30) = c("ensembl","symbol")
kmeanscor1 = merge(kmeanscor, gencode_30, by="ensembl")

#second run
k <- kmeans(df3, centers = 2, nstart = 25)
kmeans <- k$cluster
kmean_download <- as.data.frame(kmeans)
kmeanscor <- cbind(kmean_download, DF_log2FC)
kmeanscor2 = merge(kmeanscor, gencode_30, by="ensembl")

res <- cor.test(kmeanscor1$kmeans, kmeanscor2$kmeans, method = "pearson")
res 
## 
##  Pearson's product-moment correlation
## 
## data:  kmeanscor1$kmeans and kmeanscor2$kmeans
## t = -Inf, df = 1202, p-value < 2.2e-16
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
##  -1 -1
## sample estimates:
## cor 
##  -1

first run gprofiler

cluster1 <- kmeanscor1[kmeanscor1$kmeans == 1,]
cluster2 <- kmeanscor1[kmeanscor1$kmeans == 2,]


gprofiler_results_1 <- gprofiler(query = cluster1$symbol,                                   organism = "hsapiens",
                                  ordered_query = F, 
                                  exclude_iea = F, 
                                  max_p_value = 0.05, 
                                  max_set_size = 0,
                                  correction_method = "fdr",
                                  hier_filtering = "none", 
                                  domain_size = "annotated")

gprofiler_results_2 <- gprofiler(query = cluster2$symbol,                                   organism = "hsapiens",
                                  ordered_query = F, 
                                  exclude_iea = F, 
                                  max_p_value = 0.05, 
                                  max_set_size = 0,
                                  correction_method = "fdr",
                                  hier_filtering = "none", 
                                  domain_size = "annotated")



head(gprofiler_results_1$term.name)
## [1] "peptidyl-tyrosine modification"                        
## [2] "nucleobase-containing small molecule catabolic process"
## [3] "odontoblast differentiation"                           
## [4] "regulation of phagocytosis"                            
## [5] "positive regulation of phagocytosis"                   
## [6] "circulatory system process"
head(gprofiler_results_2$term.name)
## [1] "response to corticosteroid"            
## [2] "response to glucocorticoid"            
## [3] "neutrophil differentiation"            
## [4] "response to stimulus"                  
## [5] "response to chemical"                  
## [6] "response to oxygen-containing compound"

second run gprofiler

cluster1_2 <- kmeanscor2[kmeanscor2$kmeans == 1,]
cluster2_2 <- kmeanscor2[kmeanscor2$kmeans == 2,]


gprofiler_results_1_2 <- gprofiler(query = cluster1_2$symbol,                                   organism = "hsapiens",
                                  ordered_query = F, 
                                  exclude_iea = F, 
                                  max_p_value = 0.05, 
                                  max_set_size = 0,
                                  correction_method = "fdr",
                                  hier_filtering = "none", 
                                  domain_size = "annotated")

gprofiler_results_2_2 <- gprofiler(query = cluster2_2$symbol,                                   organism = "hsapiens",
                                  ordered_query = F, 
                                  exclude_iea = F, 
                                  max_p_value = 0.05, 
                                  max_set_size = 0,
                                  correction_method = "fdr",
                                  hier_filtering = "none", 
                                  domain_size = "annotated")



head(gprofiler_results_1_2$term.name)
## [1] "regulation of protein oligomerization"                
## [2] "positive regulation of protein oligomerization"       
## [3] "sensory perception of sound"                          
## [4] "neuron projection arborization"                       
## [5] "regulation of neuron projection arborization"         
## [6] "positive regulation of neuron projection arborization"
head(gprofiler_results_2_2$term.name)
## [1] "neuron apoptotic process"                              
## [2] "nucleobase-containing small molecule catabolic process"
## [3] "cyclic-nucleotide-mediated signaling"                  
## [4] "negative regulation of coagulation"                    
## [5] "peptidyl-tyrosine autophosphorylation"                 
## [6] "cytosol to ER transport"

Compare 4 clusters

summary 4 clusters

k4 <- kmeans(df3, centers = 4, nstart = 25)
str(k4)
## List of 9
##  $ cluster     : int [1:1204] 4 4 4 2 1 4 4 4 4 4 ...
##  $ centers     : num [1:4, 1:7] -0.0211 0.1702 2.1107 -0.3594 -0.1524 ...
##   ..- attr(*, "dimnames")=List of 2
##   .. ..$ : chr [1:4] "1" "2" "3" "4"
##   .. ..$ : chr [1:7] "LPS" "IFNy" "IL4" "R848" ...
##  $ totss       : num 8421
##  $ withinss    : num [1:4] 773 428 726 2908
##  $ tot.withinss: num 4835
##  $ betweenss   : num 3586
##  $ size        : int [1:4] 261 86 121 736
##  $ iter        : int 3
##  $ ifault      : int 0
##  - attr(*, "class")= chr "kmeans"
fviz_cluster(k4, data = df3)

spearman correlation coefficient between 4 clusters

k <- kmeans(df3, centers = 4, nstart = 25)
kmeans <- k$cluster
kmean_download <- as.data.frame(kmeans)
kmeanscor <- cbind(kmean_download, DF_log2FC)
kmeanscor1 = merge(kmeanscor, gencode_30, by="ensembl")

k <- kmeans(df3, centers = 4, nstart = 25)
kmeans <- k$cluster
kmean_download <- as.data.frame(kmeans)
kmeanscor <- cbind(kmean_download, DF_log2FC)
kmeanscor2 = merge(kmeanscor, gencode_30, by="ensembl")

res <- cor.test(kmeanscor1$kmeans, kmeanscor2$kmeans, method = "pearson")
res 
## 
##  Pearson's product-moment correlation
## 
## data:  kmeanscor1$kmeans and kmeanscor2$kmeans
## t = 100.24, df = 1202, p-value < 2.2e-16
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
##  0.9386973 0.9508033
## sample estimates:
##       cor 
## 0.9450735

first run gprofiler

cluster1 <- kmeanscor1[kmeanscor1$kmeans == 1,]
cluster2 <- kmeanscor1[kmeanscor1$kmeans == 2,]
cluster3 <- kmeanscor1[kmeanscor1$kmeans == 3,]
cluster4 <- kmeanscor1[kmeanscor1$kmeans == 4,]


gprofiler_results_1 <- gprofiler(query = cluster1$symbol,                                   organism = "hsapiens",
                                  ordered_query = F, 
                                  exclude_iea = F, 
                                  max_p_value = 0.05, 
                                  max_set_size = 0,
                                  correction_method = "fdr",
                                  hier_filtering = "none", 
                                  domain_size = "annotated")

gprofiler_results_2 <- gprofiler(query = cluster2$symbol,                                   organism = "hsapiens",
                                  ordered_query = F, 
                                  exclude_iea = F, 
                                  max_p_value = 0.05, 
                                  max_set_size = 0,
                                  correction_method = "fdr",
                                  hier_filtering = "none", 
                                  domain_size = "annotated")

gprofiler_results_3 <- gprofiler(query = cluster3$symbol,                                   organism = "hsapiens",
                                  ordered_query = F, 
                                  exclude_iea = F, 
                                  max_p_value = 0.05, 
                                  max_set_size = 0,
                                  correction_method = "fdr",
                                  hier_filtering = "none", 
                                  domain_size = "annotated")

gprofiler_results_4 <- gprofiler(query = cluster4$symbol,                                   organism = "hsapiens",
                                  ordered_query = F, 
                                  exclude_iea = F, 
                                  max_p_value = 0.05, 
                                  max_set_size = 0,
                                  correction_method = "fdr",
                                  hier_filtering = "none", 
                                  domain_size = "annotated")


head(gprofiler_results_1$term.name)
## [1] "regulation of transcription from RNA polymerase II promoter by histone modification"         
## [2] "negative regulation of transcription by RNA polymerase II"                                   
## [3] "negative regulation of transcription from RNA polymerase II promoter by histone modification"
## [4] "extracellular matrix organization"                                                           
## [5] "phosphatidylcholine biosynthetic process"                                                    
## [6] "purine-containing compound catabolic process"
head(gprofiler_results_2$term.name)
## [1] "regulation of syncytium formation by plasma membrane fusion"
## [2] "regulation of myoblast fusion"                              
## [3] "lipoprotein metabolic process"                              
## [4] "T cell proliferation"                                       
## [5] "regulation of T cell proliferation"                         
## [6] "regulation of cAMP-mediated signaling"
head(gprofiler_results_3$term.name)
## [1] "regulation of endothelial cell development"             
## [2] "regulation of establishment of endothelial barrier"     
## [3] "hepatic immune response"                                
## [4] "ovulation"                                              
## [5] "protein localization to cell leading edge"              
## [6] "regulation of protein localization to cell leading edge"
head(gprofiler_results_4$term.name)
## [1] "FIB-associated protein complex" "RHINO-TopBP1 complex"          
## [3] "PICK1-GRIP1-GLUR2 complex"      "GRASP1-GRIP1 complex"          
## [5] "PlexinA1-NRP1-SEMA3A complex"   "MICA-KLRK1-HCST complex"

second run gprofiler

cluster1_2 <- kmeanscor2[kmeanscor2$kmeans == 1,]
cluster2_2 <- kmeanscor2[kmeanscor2$kmeans == 2,]
cluster3_2 <- kmeanscor2[kmeanscor2$kmeans == 3,]
cluster4_2 <- kmeanscor2[kmeanscor2$kmeans == 4,]



gprofiler_results_1_2 <- gprofiler(query = cluster1_2$symbol,                                   organism = "hsapiens",
                                  ordered_query = F, 
                                  exclude_iea = F, 
                                  max_p_value = 0.05, 
                                  max_set_size = 0,
                                  correction_method = "fdr",
                                  hier_filtering = "none", 
                                  domain_size = "annotated")

gprofiler_results_2_2 <- gprofiler(query = cluster2_2$symbol,                                   organism = "hsapiens",
                                  ordered_query = F, 
                                  exclude_iea = F, 
                                  max_p_value = 0.05, 
                                  max_set_size = 0,
                                  correction_method = "fdr",
                                  hier_filtering = "none", 
                                  domain_size = "annotated")

gprofiler_results_3_2 <- gprofiler(query = cluster3_2$symbol,                                   organism = "hsapiens",
                                  ordered_query = F, 
                                  exclude_iea = F, 
                                  max_p_value = 0.05, 
                                  max_set_size = 0,
                                  correction_method = "fdr",
                                  hier_filtering = "none", 
                                  domain_size = "annotated")

gprofiler_results_4_2 <- gprofiler(query = cluster4_2$symbol,                                   organism = "hsapiens",
                                  ordered_query = F, 
                                  exclude_iea = F, 
                                  max_p_value = 0.05, 
                                  max_set_size = 0,
                                  correction_method = "fdr",
                                  hier_filtering = "none", 
                                  domain_size = "annotated")


head(gprofiler_results_1_2$term.name)
## [1] "response to calcium ion"                                                            
## [2] "regulation of fatty acid oxidation"                                                 
## [3] "peptidyl-tyrosine modification"                                                     
## [4] "purine-containing compound catabolic process"                                       
## [5] "negative regulation of plasminogen activation"                                      
## [6] "regulation of transcription from RNA polymerase II promoter by histone modification"
head(gprofiler_results_2_2$term.name)
## [1] "luteinizing hormone signaling pathway"                           
## [2] "response to iron ion starvation"                                 
## [3] "protein localization to cell leading edge"                       
## [4] "regulation of protein localization to cell leading edge"         
## [5] "negative regulation of protein localization to cell leading edge"
## [6] "pyrimidine nucleobase transport"
head(gprofiler_results_3_2$term.name)
## [1] "regulation of apoptotic cell clearance"         
## [2] "positive regulation of apoptotic cell clearance"
## [3] "STAT cascade"                                   
## [4] "JAK-STAT cascade"                               
## [5] "negative regulation of cell adhesion"           
## [6] "negative regulation of cell-cell adhesion"
head(gprofiler_results_4_2$term.name)
## [1] "PICK1-GRIP1-GLUR2 complex"          "PlexinA1-NRP1-SEMA3A complex"      
## [3] "TFIIH transcription factor complex" "Nav1.2-beta2 complex"              
## [5] "ITGA2-ITGB1-CD47 complex"           "FIB-associated protein complex"

Compare 6 clusters

summary 6 clusters

k6 <- kmeans(df3, centers = 6, nstart = 25)
str(k6)
## List of 9
##  $ cluster     : int [1:1204] 6 5 6 4 2 6 5 5 5 6 ...
##  $ centers     : num [1:6, 1:7] -0.2668 -0.0329 2.3003 0.1708 -0.8236 ...
##   ..- attr(*, "dimnames")=List of 2
##   .. ..$ : chr [1:6] "1" "2" "3" "4" ...
##   .. ..$ : chr [1:7] "LPS" "IFNy" "IL4" "R848" ...
##  $ totss       : num 8421
##  $ withinss    : num [1:6] 574 548 591 435 785 ...
##  $ tot.withinss: num 3803
##  $ betweenss   : num 4618
##  $ size        : int [1:6] 180 222 103 87 253 359
##  $ iter        : int 4
##  $ ifault      : int 0
##  - attr(*, "class")= chr "kmeans"
fviz_cluster(k6, data = df3)

spearman correlation coefficient between 6 clusters

# first run
k <- kmeans(df3, centers = 6, nstart = 25)
kmeans <- k$cluster
kmean_download <- as.data.frame(kmeans)
kmeanscor <- cbind(kmean_download, DF_log2FC)
kmeanscor1 = merge(kmeanscor, gencode_30, by="ensembl")

# second run 
k <- kmeans(df3, centers = 6, nstart = 25)
kmeans <- k$cluster
kmean_download <- as.data.frame(kmeans)
kmeanscor <- cbind(kmean_download, DF_log2FC)
kmeanscor2 = merge(kmeanscor, gencode_30, by="ensembl")

res <- cor.test(kmeanscor1$kmeans, kmeanscor2$kmeans, method = "pearson")
res 
## 
##  Pearson's product-moment correlation
## 
## data:  kmeanscor1$kmeans and kmeanscor2$kmeans
## t = -2.6365, df = 1202, p-value = 0.008485
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
##  -0.13175701 -0.01941359
## sample estimates:
##         cor 
## -0.07582593

first run gprofiler

cluster1 <- kmeanscor1[kmeanscor1$kmeans == 1,]
cluster2 <- kmeanscor1[kmeanscor1$kmeans == 2,]
cluster3 <- kmeanscor1[kmeanscor1$kmeans == 3,]
cluster4 <- kmeanscor1[kmeanscor1$kmeans == 4,]
cluster5 <- kmeanscor1[kmeanscor1$kmeans == 5,]
cluster6 <- kmeanscor1[kmeanscor1$kmeans == 6,]

gprofiler_results_1 <- gprofiler(query = cluster1$symbol,                                   organism = "hsapiens",
                                  ordered_query = F, 
                                  exclude_iea = F, 
                                  max_p_value = 0.05, 
                                  max_set_size = 0,
                                  correction_method = "fdr",
                                  hier_filtering = "none", 
                                  domain_size = "annotated")

gprofiler_results_2 <- gprofiler(query = cluster2$symbol,                                   organism = "hsapiens",
                                  ordered_query = F, 
                                  exclude_iea = F, 
                                  max_p_value = 0.05, 
                                  max_set_size = 0,
                                  correction_method = "fdr",
                                  hier_filtering = "none", 
                                  domain_size = "annotated")

gprofiler_results_3 <- gprofiler(query = cluster3$symbol,                                   organism = "hsapiens",
                                  ordered_query = F, 
                                  exclude_iea = F, 
                                  max_p_value = 0.05, 
                                  max_set_size = 0,
                                  correction_method = "fdr",
                                  hier_filtering = "none", 
                                  domain_size = "annotated")

gprofiler_results_4 <- gprofiler(query = cluster4$symbol,                                   organism = "hsapiens",
                                  ordered_query = F, 
                                  exclude_iea = F, 
                                  max_p_value = 0.05, 
                                  max_set_size = 0,
                                  correction_method = "fdr",
                                  hier_filtering = "none", 
                                  domain_size = "annotated")

gprofiler_results_5 <- gprofiler(query = cluster5$symbol,                                   organism = "hsapiens",
                                  ordered_query = F, 
                                  exclude_iea = F, 
                                  max_p_value = 0.05, 
                                  max_set_size = 0,
                                  correction_method = "fdr",
                                  hier_filtering = "none", 
                                  domain_size = "annotated")

gprofiler_results_6 <- gprofiler(query = cluster6$symbol,                                   organism = "hsapiens",
                                  ordered_query = F, 
                                  exclude_iea = F, 
                                  max_p_value = 0.05, 
                                  max_set_size = 0,
                                  correction_method = "fdr",
                                  hier_filtering = "none", 
                                  domain_size = "annotated")

head(gprofiler_results_1$term.name)
## [1] "protein deubiquitination"                       
## [2] "alpha-beta T cell activation"                   
## [3] "regulation of alpha-beta T cell activation"     
## [4] "CD4-positive, alpha-beta T cell activation"     
## [5] "alpha-beta T cell differentiation"              
## [6] "CD4-positive, alpha-beta T cell differentiation"
head(gprofiler_results_2$term.name)
## [1] "negative regulation of interleukin-12 production"          
## [2] "cellular response to mycotoxin"                            
## [3] "regulation of fatty acid oxidation"                        
## [4] "negative regulation of type I interferon production"       
## [5] "positive regulation of mitotic cell cycle phase transition"
## [6] "negative regulation of chronic inflammatory response"
head(gprofiler_results_3$term.name)
## character(0)
head(gprofiler_results_4$term.name)
## [1] "cellular developmental process" "cell differentiation"          
## [3] "cell development"               "cell activation"               
## [5] "catalytic activity"             "acylglycerol lipase activity"
head(gprofiler_results_5$term.name)
## [1] "modulation by host of viral glycoprotein metabolic process"         
## [2] "negative regulation by host of viral glycoprotein metabolic process"
## [3] "negative regulation of iron ion transport"                          
## [4] "regulation of iron ion transmembrane transport"                     
## [5] "negative regulation of iron ion transmembrane transport"            
## [6] "purine nucleobase transmembrane transport"
head(gprofiler_results_6$term.name)
## [1] "HDAC1-associated core complex cII"  "9-1-1-RHINO complex"               
## [3] "TFIIH transcription factor complex" "ITGA2-ITGB1-CHAD complex"          
## [5] "GRASP1-GRIP1 complex"               "ESR1-GRIP1 complex"

second run gprofiler

cluster1_2 <- kmeanscor2[kmeanscor2$kmeans == 1,]
cluster2_2 <- kmeanscor2[kmeanscor2$kmeans == 2,]
cluster3_2 <- kmeanscor2[kmeanscor2$kmeans == 3,]
cluster4_2 <- kmeanscor2[kmeanscor2$kmeans == 4,]
cluster5_2 <- kmeanscor2[kmeanscor2$kmeans == 5,]
cluster6_2 <- kmeanscor2[kmeanscor2$kmeans == 6,]


gprofiler_results_1_2 <- gprofiler(query = cluster1_2$symbol,                                   organism = "hsapiens",
                                  ordered_query = F, 
                                  exclude_iea = F, 
                                  max_p_value = 0.05, 
                                  max_set_size = 0,
                                  correction_method = "fdr",
                                  hier_filtering = "none", 
                                  domain_size = "annotated")

gprofiler_results_2_2 <- gprofiler(query = cluster2_2$symbol,                                   organism = "hsapiens",
                                  ordered_query = F, 
                                  exclude_iea = F, 
                                  max_p_value = 0.05, 
                                  max_set_size = 0,
                                  correction_method = "fdr",
                                  hier_filtering = "none", 
                                  domain_size = "annotated")

gprofiler_results_3_2 <- gprofiler(query = cluster3_2$symbol,                                   organism = "hsapiens",
                                  ordered_query = F, 
                                  exclude_iea = F, 
                                  max_p_value = 0.05, 
                                  max_set_size = 0,
                                  correction_method = "fdr",
                                  hier_filtering = "none", 
                                  domain_size = "annotated")

gprofiler_results_4_2 <- gprofiler(query = cluster4_2$symbol,                                   organism = "hsapiens",
                                  ordered_query = F, 
                                  exclude_iea = F, 
                                  max_p_value = 0.05, 
                                  max_set_size = 0,
                                  correction_method = "fdr",
                                  hier_filtering = "none", 
                                  domain_size = "annotated")

gprofiler_results_5_2 <- gprofiler(query = cluster5_2$symbol,                                   organism = "hsapiens",
                                  ordered_query = F, 
                                  exclude_iea = F, 
                                  max_p_value = 0.05, 
                                  max_set_size = 0,
                                  correction_method = "fdr",
                                  hier_filtering = "none", 
                                  domain_size = "annotated")

gprofiler_results_6_2 <- gprofiler(query = cluster6_2$symbol,                                   organism = "hsapiens",
                                  ordered_query = F, 
                                  exclude_iea = F, 
                                  max_p_value = 0.05, 
                                  max_set_size = 0,
                                  correction_method = "fdr",
                                  hier_filtering = "none", 
                                  domain_size = "annotated")

head(gprofiler_results_1_2$term.name)
## [1] "fused antrum stage"                                                    
## [2] "ovarian cumulus expansion"                                             
## [3] "ovulation"                                                             
## [4] "negative regulation of glucose transmembrane transport"                
## [5] "positive regulation of leukocyte adhesion to arterial endothelial cell"
## [6] "negative regulation of branching involved in lung morphogenesis"
head(gprofiler_results_2_2$term.name)
## [1] "STAT cascade"                                              
## [2] "JAK-STAT cascade"                                          
## [3] "post-translational protein modification"                   
## [4] "I-kappaB kinase/NF-kappaB signaling"                       
## [5] "regulation of I-kappaB kinase/NF-kappaB signaling"         
## [6] "positive regulation of I-kappaB kinase/NF-kappaB signaling"
head(gprofiler_results_3_2$term.name)
## character(0)
head(gprofiler_results_4_2$term.name)
## [1] "PlexinA1-NRP1-SEMA3A complex"     "ITGA2-ITGB1-CHAD complex"        
## [3] "HDAC1-associated protein complex" "HDAC2-asscociated core complex"  
## [5] "ITGA2-ITGB1-COL6A3 complex"       "MICA-KLRK1-HCST complex"
head(gprofiler_results_5_2$term.name)
## [1] "response to acid chemical"                                
## [2] "negative regulation of chronic inflammatory response"     
## [3] "regulation of high voltage-gated calcium channel activity"
## [4] "response to inorganic substance"                          
## [5] "movement of cell or subcellular component"                
## [6] "tissue development"
head(gprofiler_results_6_2$term.name)
## [1] "cellular developmental process" "cell differentiation"          
## [3] "cell development"               "cell activation"               
## [5] "catalytic activity"             "acylglycerol lipase activity"

Compare 7 clusters

summary 7 clusters

k7 <- kmeans(df3, centers = 7, nstart = 25)
str(k3)
## List of 9
##  $ cluster     : int [1:1204] 3 3 3 2 2 3 3 3 3 3 ...
##  $ centers     : num [1:3, 1:7] 2.1299 0.0598 -0.3688 -0.1881 0.5394 ...
##   ..- attr(*, "dimnames")=List of 2
##   .. ..$ : chr [1:3] "1" "2" "3"
##   .. ..$ : chr [1:7] "LPS" "IFNy" "IL4" "R848" ...
##  $ totss       : num 8421
##  $ withinss    : num [1:3] 716 1899 3005
##  $ tot.withinss: num 5619
##  $ betweenss   : num 2802
##  $ size        : int [1:3] 118 348 738
##  $ iter        : int 3
##  $ ifault      : int 0
##  - attr(*, "class")= chr "kmeans"
fviz_cluster(k7, data = df3)

spearman correlation coefficient between 7 clusters

#first run 
k <- kmeans(df3, centers = 7, nstart = 25)
kmeans <- k$cluster
kmean_download <- as.data.frame(kmeans)
kmeanscor <- cbind(kmean_download, DF_log2FC)
kmeanscor1 = merge(kmeanscor, gencode_30, by="ensembl")

#second run 
k <- kmeans(df3, centers = 7, nstart = 25)
kmeans <- k$cluster
kmean_download <- as.data.frame(kmeans)
kmeanscor <- cbind(kmean_download, DF_log2FC)
kmeanscor2 = merge(kmeanscor, gencode_30, by="ensembl")

res <- cor.test(kmeanscor1$kmeans, kmeanscor2$kmeans, method = "pearson")
res 
## 
##  Pearson's product-moment correlation
## 
## data:  kmeanscor1$kmeans and kmeanscor2$kmeans
## t = 6.1875, df = 1202, p-value = 8.368e-10
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
##  0.1203924 0.2299064
## sample estimates:
##       cor 
## 0.1756929

first run gprofiler

cluster1 <- kmeanscor1[kmeanscor1$kmeans == 1,]
cluster2 <- kmeanscor1[kmeanscor1$kmeans == 2,]
cluster3 <- kmeanscor1[kmeanscor1$kmeans == 3,]
cluster4 <- kmeanscor1[kmeanscor1$kmeans == 4,]
cluster5 <- kmeanscor1[kmeanscor1$kmeans == 5,]
cluster6 <- kmeanscor1[kmeanscor1$kmeans == 6,]
cluster7 <- kmeanscor1[kmeanscor1$kmeans == 7,]

gprofiler_results_1 <- gprofiler(query = cluster1$symbol,                                   organism = "hsapiens",
                                  ordered_query = F, 
                                  exclude_iea = F, 
                                  max_p_value = 0.05, 
                                  max_set_size = 0,
                                  correction_method = "fdr",
                                  hier_filtering = "none", 
                                  domain_size = "annotated")

gprofiler_results_2 <- gprofiler(query = cluster2$symbol,                                   organism = "hsapiens",
                                  ordered_query = F, 
                                  exclude_iea = F, 
                                  max_p_value = 0.05, 
                                  max_set_size = 0,
                                  correction_method = "fdr",
                                  hier_filtering = "none", 
                                  domain_size = "annotated")

gprofiler_results_3 <- gprofiler(query = cluster3$symbol,                                   organism = "hsapiens",
                                  ordered_query = F, 
                                  exclude_iea = F, 
                                  max_p_value = 0.05, 
                                  max_set_size = 0,
                                  correction_method = "fdr",
                                  hier_filtering = "none", 
                                  domain_size = "annotated")

gprofiler_results_4 <- gprofiler(query = cluster4$symbol,                                   organism = "hsapiens",
                                  ordered_query = F, 
                                  exclude_iea = F, 
                                  max_p_value = 0.05, 
                                  max_set_size = 0,
                                  correction_method = "fdr",
                                  hier_filtering = "none", 
                                  domain_size = "annotated")

gprofiler_results_5 <- gprofiler(query = cluster5$symbol,                                   organism = "hsapiens",
                                  ordered_query = F, 
                                  exclude_iea = F, 
                                  max_p_value = 0.05, 
                                  max_set_size = 0,
                                  correction_method = "fdr",
                                  hier_filtering = "none", 
                                  domain_size = "annotated")

gprofiler_results_6 <- gprofiler(query = cluster6$symbol,                                   organism = "hsapiens",
                                  ordered_query = F, 
                                  exclude_iea = F, 
                                  max_p_value = 0.05, 
                                  max_set_size = 0,
                                  correction_method = "fdr",
                                  hier_filtering = "none", 
                                  domain_size = "annotated")
gprofiler_results_7 <- gprofiler(query = cluster7$symbol,                                   organism = "hsapiens",
                                  ordered_query = F, 
                                  exclude_iea = F, 
                                  max_p_value = 0.05, 
                                  max_set_size = 0,
                                  correction_method = "fdr",
                                  hier_filtering = "none", 
                                  domain_size = "annotated")

head(gprofiler_results_1$term.name)
## [1] "ammonium ion metabolic process"   "response to stimulus"            
## [3] "tube development"                 "tube morphogenesis"              
## [5] "side of membrane"                 "external side of plasma membrane"
head(gprofiler_results_2$term.name)
## [1] "alpha-beta T cell activation"                   
## [2] "regulation of alpha-beta T cell activation"     
## [3] "CD4-positive, alpha-beta T cell activation"     
## [4] "alpha-beta T cell differentiation"              
## [5] "CD4-positive, alpha-beta T cell differentiation"
## [6] "leukocyte proliferation"
head(gprofiler_results_3$term.name)
## [1] "HDAC1-associated protein complex"                             
## [2] "PlexinA1-NRP1-SEMA3A complex"                                 
## [3] "PICK1-GRIP1-GLUR2 complex"                                    
## [4] "NRD complex (Nucleosome remodeling and deacetylation complex)"
## [5] "FIF-FGR2 complex"                                             
## [6] "ESR1-GRIP1 complex"
head(gprofiler_results_4$term.name)
## [1] "localization of cell"                     
## [2] "regulation of locomotion"                 
## [3] "cell motility"                            
## [4] "regulation of cellular component movement"
## [5] "regulation of cell motility"              
## [6] "regulation of cell migration"
head(gprofiler_results_5$term.name)
## [1] "fused antrum stage"          "ovarian cumulus expansion"  
## [3] "collagen metabolic process"  "collagen catabolic process" 
## [5] "adenosine catabolic process" "xanthine metabolic process"
head(gprofiler_results_6$term.name)
## [1] "regulation of interleukin-2 production"                 
## [2] "muscle cell proliferation"                              
## [3] "smooth muscle cell proliferation"                       
## [4] "regulation of smooth muscle cell proliferation"         
## [5] "negative regulation of smooth muscle cell proliferation"
## [6] "response to wounding"
head(gprofiler_results_7$term.name)
## [1] "cell-cell signaling by wnt"                     
## [2] "Wnt signaling pathway"                          
## [3] "response to calcium ion"                        
## [4] "reactive oxygen species metabolic process"      
## [5] "phosphorus metabolic process"                   
## [6] "phosphate-containing compound metabolic process"

second run gprofiler

cluster1_2 <- kmeanscor2[kmeanscor2$kmeans == 1,]
cluster2_2 <- kmeanscor2[kmeanscor2$kmeans == 2,]
cluster3_2 <- kmeanscor2[kmeanscor2$kmeans == 3,]
cluster4_2 <- kmeanscor2[kmeanscor2$kmeans == 4,]
cluster5_2 <- kmeanscor2[kmeanscor2$kmeans == 5,]
cluster6_2 <- kmeanscor2[kmeanscor2$kmeans == 6,]
cluster7_2 <- kmeanscor2[kmeanscor2$kmeans == 7,]


gprofiler_results_1_2 <- gprofiler(query = cluster1_2$symbol,                                   organism = "hsapiens",
                                  ordered_query = F, 
                                  exclude_iea = F, 
                                  max_p_value = 0.05, 
                                  max_set_size = 0,
                                  correction_method = "fdr",
                                  hier_filtering = "none", 
                                  domain_size = "annotated")

gprofiler_results_2_2 <- gprofiler(query = cluster2_2$symbol,                                   organism = "hsapiens",
                                  ordered_query = F, 
                                  exclude_iea = F, 
                                  max_p_value = 0.05, 
                                  max_set_size = 0,
                                  correction_method = "fdr",
                                  hier_filtering = "none", 
                                  domain_size = "annotated")

gprofiler_results_3_2 <- gprofiler(query = cluster3_2$symbol,                                   organism = "hsapiens",
                                  ordered_query = F, 
                                  exclude_iea = F, 
                                  max_p_value = 0.05, 
                                  max_set_size = 0,
                                  correction_method = "fdr",
                                  hier_filtering = "none", 
                                  domain_size = "annotated")

gprofiler_results_4_2 <- gprofiler(query = cluster4_2$symbol,                                   organism = "hsapiens",
                                  ordered_query = F, 
                                  exclude_iea = F, 
                                  max_p_value = 0.05, 
                                  max_set_size = 0,
                                  correction_method = "fdr",
                                  hier_filtering = "none", 
                                  domain_size = "annotated")

gprofiler_results_5_2 <- gprofiler(query = cluster5_2$symbol,                                   organism = "hsapiens",
                                  ordered_query = F, 
                                  exclude_iea = F, 
                                  max_p_value = 0.05, 
                                  max_set_size = 0,
                                  correction_method = "fdr",
                                  hier_filtering = "none", 
                                  domain_size = "annotated")

gprofiler_results_6_2 <- gprofiler(query = cluster6_2$symbol,                                   organism = "hsapiens",
                                  ordered_query = F, 
                                  exclude_iea = F, 
                                  max_p_value = 0.05, 
                                  max_set_size = 0,
                                  correction_method = "fdr",
                                  hier_filtering = "none", 
                                  domain_size = "annotated")

gprofiler_results_7_2 <- gprofiler(query = cluster7_2$symbol,                                   organism = "hsapiens",
                                  ordered_query = F, 
                                  exclude_iea = F, 
                                  max_p_value = 0.05, 
                                  max_set_size = 0,
                                  correction_method = "fdr",
                                  hier_filtering = "none", 
                                  domain_size = "annotated")

head(gprofiler_results_1_2$term.name)
## [1] "NRD complex (Nucleosome remodeling and deacetylation complex)"
## [2] "Nav1.2-beta2 complex"                                         
## [3] "HDAC1-associated protein complex"                             
## [4] "FIF-FGR2 complex"                                             
## [5] "PlexinA1-NRP1-SEMA3A complex"                                 
## [6] "ER-alpha-GRIP1-c-Jun complex"
head(gprofiler_results_2_2$term.name)
## [1] "tube development"                 "tube morphogenesis"              
## [3] "ammonium ion metabolic process"   "response to stimulus"            
## [5] "side of membrane"                 "external side of plasma membrane"
head(gprofiler_results_3_2$term.name)
## [1] "DNA replication initiation"                     
## [2] "phosphorus metabolic process"                   
## [3] "phosphate-containing compound metabolic process"
## [4] "regulation of phosphorus metabolic process"     
## [5] "regulation of phosphate metabolic process"      
## [6] "regulation of molecular function"
head(gprofiler_results_4_2$term.name)
## [1] "swimming behavior"                                               
## [2] "purine nucleobase transmembrane transport"                       
## [3] "positive regulation of smooth muscle contraction"                
## [4] "protein localization to cell leading edge"                       
## [5] "regulation of protein localization to cell leading edge"         
## [6] "negative regulation of protein localization to cell leading edge"
head(gprofiler_results_5_2$term.name)
## [1] "alpha-beta T cell activation"                   
## [2] "regulation of alpha-beta T cell activation"     
## [3] "CD4-positive, alpha-beta T cell activation"     
## [4] "alpha-beta T cell differentiation"              
## [5] "CD4-positive, alpha-beta T cell differentiation"
## [6] "leukocyte proliferation"
head(gprofiler_results_6_2$term.name)
## [1] "parturition"                                         
## [2] "chronic inflammatory response"                       
## [3] "negative regulation of chronic inflammatory response"
## [4] "muscle cell proliferation"                           
## [5] "smooth muscle cell proliferation"                    
## [6] "regulation of smooth muscle cell proliferation"
head(gprofiler_results_7_2$term.name)
## [1] "localization of cell"                     
## [2] "cell motility"                            
## [3] "regulation of locomotion"                 
## [4] "regulation of cellular component movement"
## [5] "regulation of cell motility"              
## [6] "regulation of cell migration"